{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "49f6703b-abb7-4f56-a0f4-acfdf0ea71b9",
   "metadata": {},
   "source": [
    "# 任务2：ChatGLM API使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5d7b1457-fc1e-4cc3-9567-18e5537b7683",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import jwt\n",
    "import requests\n",
    "from itertools import combinations\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 实际KEY，过期时间\n",
    "def generate_token(apikey: str, exp_seconds: int):\n",
    "    try:\n",
    "        id, secret = apikey.split(\".\")\n",
    "    except Exception as e:\n",
    "        raise Exception(\"invalid apikey\", e)\n",
    "\n",
    "    payload = {\n",
    "        \"api_key\": id,\n",
    "        \"exp\": int(round(time.time() * 1000)) + exp_seconds * 1000,\n",
    "        \"timestamp\": int(round(time.time() * 1000)),\n",
    "    }\n",
    "    return jwt.encode(\n",
    "        payload,\n",
    "        secret,\n",
    "        algorithm=\"HS256\",\n",
    "        headers={\"alg\": \"HS256\", \"sign_type\": \"SIGN\"},\n",
    "    )\n",
    "\n",
    "def ask_glm(question, nretry=5):\n",
    "    if nretry == 0:\n",
    "        return None\n",
    "\n",
    "    url = \"https://open.bigmodel.cn/api/paas/v4/chat/completions\"\n",
    "    headers = {\n",
    "      'Content-Type': 'application/json',\n",
    "      'Authorization': generate_token(\"7bf001734ef2fd7f7a55bf51dadd7cbb.BMAsoKRDFTmTEPwj\", 1000)\n",
    "    }\n",
    "    data = {\n",
    "        \"model\": \"glm-3-turbo\",\n",
    "        \"p\": 0.5,\n",
    "        \"messages\": [{\"role\": \"user\", \"content\": question}]\n",
    "    }\n",
    "    try:\n",
    "        response = requests.post(url, headers=headers, json=data, timeout=10)\n",
    "        return response.json()\n",
    "    except:\n",
    "        return ask_glm(question, nretry-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f99088fa-e683-4c51-947d-347b3f14763d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def regex_agent(question):\n",
    "    prompt_template = '''你是一个专业的python的工程师，擅长编写各种的正则表达式。将下面的要求转换为正则匹配表达式，只需要输出表达式，不要有其他的输出。\n",
    "{0}\n",
    "    '''.format(question)\n",
    "\n",
    "    return ask_glm(prompt_template)['choices'][0]['message']['content']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "517e087f-5f47-4d27-b0c4-ed49e3657db5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "```\n",
      "\\b[A-Z][a-z]*\\b\n",
      "```\n"
     ]
    }
   ],
   "source": [
    "print(regex_agent(\"识别首字母大写单词的正则\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "3e591f76-4f28-449f-9a9b-cdd1c5ba653f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "`^[A-Z][a-zA-Z]{0,9}$`\n"
     ]
    }
   ],
   "source": [
    "print(regex_agent(\"识别首字母大写且字符个数小于10的正则\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "59753dea-a85c-46c4-b821-77540091357f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\b\\w+[.,;!?]$\n"
     ]
    }
   ],
   "source": [
    "print(regex_agent(\"识别单词末尾为标点符号的正则\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e4d21bbd-3ca6-4a0b-b9b3-e407bedf1304",
   "metadata": {},
   "source": [
    "# 任务3：数据库内容解析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "f514324e-9ee1-4c7a-b1b8-cbf546cee2f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''数据库解析'''\n",
    "from typing import Union\n",
    "import traceback\n",
    "from sqlalchemy import create_engine, inspect, func, select, Table, MetaData\n",
    "import pandas as pd\n",
    "\n",
    "class DBParser:\n",
    "    '''DBParser'''\n",
    "    def __init__(self, db_url:str) -> None:\n",
    "        '''初始化\n",
    "        db_url: 数据库链接地址\n",
    "        '''\n",
    "\n",
    "        # 判断数据库类型\n",
    "        if 'sqlite' in db_url:\n",
    "            self.db_type = 'sqlite'\n",
    "        elif 'mysql' in db_url:\n",
    "            self.db_type = 'mysql'\n",
    "\n",
    "        # 链接数据库\n",
    "        self.engine = create_engine(db_url, echo=False)\n",
    "        self.conn = self.engine.connect()\n",
    "        self.db_url = db_url\n",
    "\n",
    "        # 查看表明\n",
    "        self.inspector = inspect(self.engine)\n",
    "        self.table_names = self.inspector.get_table_names()\n",
    "\n",
    "        self._table_fields = {} # 数据表字段\n",
    "        self.foreign_keys = [] # 数据库外键\n",
    "        self._table_sample = {} # 数据表样例\n",
    "\n",
    "        # 依次对每张表的字段进行统计\n",
    "        for table_name in self.table_names:\n",
    "            print(\"Table ->\", table_name)\n",
    "            self._table_fields[table_name] = {}\n",
    "\n",
    "            # 累计外键\n",
    "            self.foreign_keys += [\n",
    "                {\n",
    "                    'constrained_table': table_name,\n",
    "                    'constrained_columns': x['constrained_columns'],\n",
    "                    'referred_table': x['referred_table'],\n",
    "                    'referred_columns': x['referred_columns'],\n",
    "                } for x in self.inspector.get_foreign_keys(table_name)\n",
    "            ]\n",
    "\n",
    "            # 获取当前表的字段信息\n",
    "            table_instance = Table(table_name, MetaData(), autoload=True, autoload_with=self.engine)\n",
    "            table_columns = self.inspector.get_columns(table_name)\n",
    "            self._table_fields[table_name] = {x['name']:x for x in table_columns}\n",
    "\n",
    "            # 对当前字段进行统计\n",
    "            for column_meta in table_columns:\n",
    "                # 获取当前字段\n",
    "                column_instance = getattr(table_instance.columns, column_meta['name'])\n",
    "\n",
    "                # 统计unique\n",
    "                query = select(func.count(func.distinct(column_instance)))\n",
    "                distinct_count = self.conn.execute(query).fetchone()[0]\n",
    "                self._table_fields[table_name][column_meta['name']]['distinct'] = distinct_count\n",
    "\n",
    "                # 统计most frequency value\n",
    "                field_type = self._table_fields[table_name][column_meta['name']]['type']\n",
    "                field_type = str(field_type)\n",
    "                if 'text' in field_type.lower() or 'char' in field_type.lower():\n",
    "                    query = (\n",
    "                        select([column_instance, func.count().label('count')])\n",
    "                        .group_by(column_instance)\n",
    "                        .order_by(func.count().desc())\n",
    "                        .limit(1)\n",
    "                    )\n",
    "                    top1_value = self.conn.execute(query).fetchone()[0]\n",
    "                    self._table_fields[table_name][column_meta['name']]['mode'] = top1_value\n",
    "\n",
    "                # 统计missing个数\n",
    "                query = select(func.count()).filter(column_instance == None)\n",
    "                nan_count = self.conn.execute(query).fetchone()[0]\n",
    "                self._table_fields[table_name][column_meta['name']]['nan_count'] = nan_count\n",
    "\n",
    "                # 统计max\n",
    "                query = select(func.max(column_instance))\n",
    "                max_value = self.conn.execute(query).fetchone()[0]\n",
    "                self._table_fields[table_name][column_meta['name']]['max'] = max_value\n",
    "\n",
    "                # 统计min\n",
    "                query = select(func.min(column_instance))\n",
    "                min_value = self.conn.execute(query).fetchone()[0]\n",
    "                self._table_fields[table_name][column_meta['name']]['min'] = min_value\n",
    "\n",
    "                # 任意取值\n",
    "                query = select(column_instance).limit(10)\n",
    "                random_value = self.conn.execute(query).all()\n",
    "                random_value = [x[0] for x in random_value]\n",
    "                random_value = [str(x) for x in random_value if x is not None]\n",
    "                random_value = list(set(random_value))\n",
    "                self._table_fields[table_name][column_meta['name']]['random'] = random_value[:3]\n",
    "\n",
    "            # 获取表样例（第一行）\n",
    "            query = select([table_instance])\n",
    "            self._table_sample[table_name] = pd.DataFrame([self.conn.execute(query).fetchone()])\n",
    "            self._table_sample[table_name].columns = [x['name'] for x in table_columns]\n",
    "\n",
    "\n",
    "    def get_table_fields(self, table_name) -> pd.DataFrame:\n",
    "        '''获取表字段信息'''\n",
    "        return pd.DataFrame.from_dict(self._table_fields[table_name]).T\n",
    "\n",
    "    def get_data_relations(self) -> pd.DataFrame:\n",
    "        '''获取数据库链接信息（主键和外键）'''\n",
    "        return pd.DataFrame(self.foreign_keys)\n",
    "\n",
    "    def get_table_sample(self, table_name) -> pd.DataFrame:\n",
    "        '''获取数据表样例'''\n",
    "        return self._table_sample[table_name]\n",
    "\n",
    "    def check_sql(self, sql) -> Union[bool, str]:\n",
    "        '''检查sql是否合理\n",
    "\n",
    "        参数\n",
    "            sql: 待执行句子\n",
    "\n",
    "        返回: 是否可以运行 报错信息\n",
    "        '''\n",
    "        try:\n",
    "            self.engine.execute(sql)\n",
    "            return True, 'ok'\n",
    "        except:\n",
    "            err_msg = traceback.format_exc()\n",
    "            return False, err_msg\n",
    "\n",
    "    def execute_sql(self, sql) -> bool:\n",
    "        '''运行SQL'''\n",
    "        result = self.engine.execute(sql)\n",
    "        return list(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "2c60defc-e7e6-471e-99d4-7847bfa5837a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Table -> A股公司行业划分表\n",
      "Table -> A股票日行情表\n",
      "Table -> 基金份额持有人结构\n",
      "Table -> 基金债券持仓明细\n",
      "Table -> 基金可转债持仓明细\n",
      "Table -> 基金基本信息\n",
      "Table -> 基金日行情表\n",
      "Table -> 基金股票持仓明细\n",
      "Table -> 基金规模变动表\n",
      "Table -> 港股票日行情表\n"
     ]
    }
   ],
   "source": [
    "parser = DBParser('sqlite:///./bs_challenge_financial_14b_dataset/dataset/博金杯比赛数据.db')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5160a47c-0825-4753-8b1b-5a0838bb2e5b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['A股公司行业划分表',\n",
       " 'A股票日行情表',\n",
       " '基金份额持有人结构',\n",
       " '基金债券持仓明细',\n",
       " '基金可转债持仓明细',\n",
       " '基金基本信息',\n",
       " '基金日行情表',\n",
       " '基金股票持仓明细',\n",
       " '基金规模变动表',\n",
       " '港股票日行情表']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parser.table_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8e509016-fa5c-424c-ae4c-f9aa5a43395e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>nullable</th>\n",
       "      <th>default</th>\n",
       "      <th>autoincrement</th>\n",
       "      <th>primary_key</th>\n",
       "      <th>distinct</th>\n",
       "      <th>mode</th>\n",
       "      <th>nan_count</th>\n",
       "      <th>max</th>\n",
       "      <th>min</th>\n",
       "      <th>random</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>股票代码</th>\n",
       "      <td>股票代码</td>\n",
       "      <td>TEXT</td>\n",
       "      <td>True</td>\n",
       "      <td>None</td>\n",
       "      <td>auto</td>\n",
       "      <td>0</td>\n",
       "      <td>5205</td>\n",
       "      <td>990018</td>\n",
       "      <td>0</td>\n",
       "      <td>990018</td>\n",
       "      <td>000001</td>\n",
       "      <td>[002549, 000065, 002561]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>交易日期</th>\n",
       "      <td>交易日期</td>\n",
       "      <td>TEXT</td>\n",
       "      <td>True</td>\n",
       "      <td>None</td>\n",
       "      <td>auto</td>\n",
       "      <td>0</td>\n",
       "      <td>1096</td>\n",
       "      <td>20211231</td>\n",
       "      <td>0</td>\n",
       "      <td>20211231</td>\n",
       "      <td>20190101</td>\n",
       "      <td>[20190120, 20190121, 20190125]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>行业划分标准</th>\n",
       "      <td>行业划分标准</td>\n",
       "      <td>TEXT</td>\n",
       "      <td>True</td>\n",
       "      <td>None</td>\n",
       "      <td>auto</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>申万行业分类</td>\n",
       "      <td>0</td>\n",
       "      <td>申万行业分类</td>\n",
       "      <td>中信行业分类</td>\n",
       "      <td>[中信行业分类]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>一级行业名称</th>\n",
       "      <td>一级行业名称</td>\n",
       "      <td>TEXT</td>\n",
       "      <td>True</td>\n",
       "      <td>None</td>\n",
       "      <td>auto</td>\n",
       "      <td>0</td>\n",
       "      <td>45</td>\n",
       "      <td>电子</td>\n",
       "      <td>309830</td>\n",
       "      <td>餐饮旅游</td>\n",
       "      <td>交通运输</td>\n",
       "      <td>[电力设备及新能源, 交通运输, 建材]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>二级行业名称</th>\n",
       "      <td>二级行业名称</td>\n",
       "      <td>TEXT</td>\n",
       "      <td>True</td>\n",
       "      <td>None</td>\n",
       "      <td>auto</td>\n",
       "      <td>0</td>\n",
       "      <td>221</td>\n",
       "      <td>None</td>\n",
       "      <td>1974149</td>\n",
       "      <td>黑色家电Ⅱ</td>\n",
       "      <td>IT服务</td>\n",
       "      <td>[一般零售, 建筑施工Ⅱ, 公交物流]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          name  type nullable default autoincrement primary_key distinct  \\\n",
       "股票代码      股票代码  TEXT     True    None          auto           0     5205   \n",
       "交易日期      交易日期  TEXT     True    None          auto           0     1096   \n",
       "行业划分标准  行业划分标准  TEXT     True    None          auto           0        2   \n",
       "一级行业名称  一级行业名称  TEXT     True    None          auto           0       45   \n",
       "二级行业名称  二级行业名称  TEXT     True    None          auto           0      221   \n",
       "\n",
       "            mode nan_count       max       min                          random  \n",
       "股票代码      990018         0    990018    000001        [002549, 000065, 002561]  \n",
       "交易日期    20211231         0  20211231  20190101  [20190120, 20190121, 20190125]  \n",
       "行业划分标准    申万行业分类         0    申万行业分类    中信行业分类                        [中信行业分类]  \n",
       "一级行业名称        电子    309830      餐饮旅游      交通运输            [电力设备及新能源, 交通运输, 建材]  \n",
       "二级行业名称      None   1974149     黑色家电Ⅱ      IT服务             [一般零售, 建筑施工Ⅱ, 公交物流]  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parser.get_table_fields(\"A股公司行业划分表\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "b2d0d564-b313-4bc9-8cef-3b1f880bc2df",
   "metadata": {},
   "outputs": [],
   "source": [
    "def sql_agent(table_name, table_info, question):\n",
    "    prompt_template = '''你是一个sql专家，基于已有的表格信息，请将下面的问题转换为sql查询语句。直接输出sql，不要输出其他内容。\n",
    "表名称：{0}\n",
    "\n",
    "表格信息：\n",
    "{1}\n",
    "\n",
    "待查询问题：{2}\n",
    "'''.format(table_name, table_info, question)\n",
    "\n",
    "    return ask_glm(prompt_template)['choices'][0]['message']['content']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "5044d6bf-4776-4f1a-9211-de88049d23e3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'```sql\\nSELECT COUNT(DISTINCT 股票代码) FROM A股公司行业划分表;\\n```'"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sql_agent(\n",
    "    \"A股公司行业划分表\",\n",
    "    parser.get_table_fields(\"A股公司行业划分表\").to_markdown(),\n",
    "    '查询下总共有多少个股票'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "9690b2f8-048f-43b2-9285-57181f4a4fb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = sql_agent(\n",
    "    \"A股公司行业划分表\",\n",
    "    parser.get_table_fields(\"A股公司行业划分表\").to_markdown(),\n",
    "    '查询下总共有多少个股票'\n",
    ")\n",
    "sql = sql.replace('```sql', '').replace('```', '').strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "cb15b255-ea13-46d3-a625-2b34deca4770",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(5205,)]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parser.execute_sql(sql)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f75fc882-91c6-47b3-b3a8-0b8dada0d374",
   "metadata": {},
   "source": [
    "# 任务4：文本索引与答案检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "8aaa6ac1-0822-4e06-b67d-18146c9262be",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['A股公司行业划分表',\n",
       " 'A股票日行情表',\n",
       " '基金份额持有人结构',\n",
       " '基金债券持仓明细',\n",
       " '基金可转债持仓明细',\n",
       " '基金基本信息',\n",
       " '基金日行情表',\n",
       " '基金股票持仓明细',\n",
       " '基金规模变动表',\n",
       " '港股票日行情表']"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parser.table_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "3f5cdbf0-0aef-4a0f-84aa-efdf90529542",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pdfplumber  # 导入pdfplumber模块，用于处理PDF文件\n",
    "import glob\n",
    "import random\n",
    "import json\n",
    "import numpy as np\n",
    "\n",
    "# !pip install rank_bm25\n",
    "from rank_bm25 import BM25Okapi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "0e91414e-cf9a-4471-957c-c3437273d9c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt_for_company_name = '''你是一个专业的识别公司名称的文本处理专家，请识别下面文档中的公司名称。直接输出公司的中文名称，不要有其他输出。\n",
    "{0}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "cf672d52-1a49-45b9-b3b8-fb89f385b695",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./bs_challenge_financial_14b_dataset/pdf/2389de12d78fe1ca4fa24910e6b1573902098bc3.PDF 深圳市铁汉生态环境股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/50b2823371fe1699d260f67cadac3d38af0672e3.PDF 浙江富春江环保热电股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/b96b11328c7bdc32b63fd15b1d1f96759ed94add.PDF 烟台杰瑞石油服务集团股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/54c7b3ab01ad11d37835a4464c9e4d68dfe6306f.PDF 云南沃森生物技术股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/044635429bd83e329c5047010121044e07568feb.PDF 上海华铭智能终端设备股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/810d8681429537dbf20a437a6dbf08e34c1ece27.PDF 浙江百达精工股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/e0a89442ee14a193c6600b0092a43a95a2ddab88.PDF 华达汽车科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/bec898d3079074d88f8bdb34d7e07f072cfca695.PDF 中国神华能源股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/398c8e64f18a13e695b5956122ef2f6a6fd3b274.PDF 上海真兰仪表科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/f7a306fea164e32f1df88b7beb8ebba8f65dd7a1.PDF 苏州海陆重工股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/6d2d79af9644ba5fef3b462fa40139c1395a3552.PDF 深圳市超频三科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/9951262c20ed33562a2fff85c83aeae320f14922.PDF 东莞勤上光电股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/edde146c2435590bc33de0f144c04a7d7bf4d5b5.PDF 新疆金风科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/5e019307c43efbd95dea0ab110808ebeba819776.PDF 义乌华鼎锦纶股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/42518828d97dd45ac34dc34a5814d18c1ebe9a83.PDF 宁波立立电子股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/91b4426b075560a1a45247f9cfa9fa73d56c945c.PDF 广州中中海海达达卫卫星星导导航航技技术术股股份份有有限限公公司司\n",
      "./bs_challenge_financial_14b_dataset/pdf/c48b454b4d171ba375fb88a6469b0a1d7f4625c2.PDF 浙江金卡高科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/756171248e278806a56171d59c6519a38eac9012.PDF 确成硅化学股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/f8fb28932337c061edb6f0a1b27f5a1f0ee0b957.PDF 金石资源集团股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/28560e1383141e35127388a3f0ca0e7b24919c17.PDF 江苏旷达汽车织物集团股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/e6ff749bb533a47173aaca91fe5d44080d9d37b3.PDF 广东银禧科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/0b46f7a2d67b5b59ad67cafffa0e12a9f0837790.PDF 武汉力源信息技术股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/ad2bf1b94db9ec7cb2689ace2daec396d2965dce.PDF 厦门安妮股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/8feb69d0cda9f9fbfb29968f9aeb4cbf1953edab.PDF 瑞达期货股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/749dc57b0a41b931f3fbcff1ec4f808ff460655e.PDF 威龙葡萄酒股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/4170a3382f5e92d32890780d79cb39115a756f40.PDF 中国铁路通信信号股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/a6f8156c08a1096c46470a1c5e1229daaaedf06e.PDF 厚普股份\n",
      "./bs_challenge_financial_14b_dataset/pdf/d1a73ca4a204ca3f98024bca8ce0652f5e1c1a94.PDF 贵州永吉印务股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/9b877e7db4809637c56eea234eaddc74cb20f9da.PDF 浙江亚厦装饰股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/fb468fe2d4d474b9fb94952d881f2218a871779c.PDF 江苏爱康太阳能科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/f30bfe8be4ad535d348d74f80eaef8d93b3c8ac5.PDF 上海维科精密模塑股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/43f9d961f0885608850dec7f4284afdaf07bdc43.PDF 深圳可立克科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/11463b6562c6ba2afde67a29bca314452ea9a2e0.PDF 国电电力发展股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/334a4deb4169c220c7a89243a104f6874ce1996a.PDF 浙江双飞无油轴承股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/44850fcc20f00a48f5bca0ad4acde91d76b839d6.PDF 西安启源机电装备股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/3e0ded8afa8f8aa952fd8179b109d6e67578c2dd.PDF 华瑞电器股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/afa8c5a4a91c3ecf7bd38a1c1f09b8a68e472909.PDF 海看网络科技（山东）股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/5ad8119f0928b041ed0f0ecf3ef4164cd64ad57c.PDF 上海宏英智能科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/6c93724343cef6b9da70d51daa739e8a1c607a22.PDF 珠海健帆生物科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/c0f513c24a7ad4d74a3e2aab7c749f64c50771d2.PDF 海南双成药业股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/47d0efaf57fb6f5d66af5ecdc422dd1da99b6d95.PDF 江苏天瑞仪器股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/54d148902b889679830174597830f0d0f22c1073.PDF 上海派能能源科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/cae2f50f92120710bea7cf31b69f1a14e7badeb6.PDF 四川高金食品股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/03c625c108ac0137f413dfd4136adb55c74b3805.PDF 安徽黄山胶囊股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/c3eae2913ce6c9f56c58a5588358740a00d47dd5.PDF 新疆浩源天然气股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/421864a100b2b65568d0aa7e08bdc9e13e5c91d2.PDF 东港安全印刷股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/9dca16666143beb8c15a6a4ba6ec2b19b2ffb8fb.PDF 内蒙古古君君正正正能源源源化化化工工工股股股份份份有有有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/952121f38a5eece19cca7cdc39079b43438fad88.PDF 浙江步森服饰股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/a0d278016e9baded15e0f5b3964563e525b1b787.PDF 武汉兴图新科电子股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/7ee0d82749433aba584ff8eabfc1575580aa0d97.PDF 深圳麦格米特电气股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/4bc783ca9fd53beb4be3a79b1712d5e42be209e1.PDF 北京天宜上佳高新材料股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/b58b0a129b2c62ca8b295d40c91134df5d5e603d.PDF 山东国瓷功能材料股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/b29a4948c1ba14e1a85f8339dd5bb0fcabadb171.PDF 浙江贝因美科工贸股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/96b461d6c6670928f7dc36f0c947e0c18340d5e2.PDF 财富证券有限责任公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/43571629ce9fb30b44ccefdca9f637102c16d1a2.PDF 天津力生制药股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/07d29cd67ca8e0fc932e05178db1fcdca1cee937.PDF 香港交易及結算所有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/1af01c24b0957a1ab6d9262b93d7ccf7093f0529.PDF 昇兴集团股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/d627620bedd94ea27c3840fa974cef3f7b582d03.PDF 深圳市建筑科学研究院股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/08538049ad1c7ca74c508bf95fe9fbbb4c924f3e.PDF 海尔施生物医药股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/eec23035376ae0e339a7643402fdbdccd92ad703.PDF 湖南长远锂科股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/b9587835ff0d27df936b1f623b412e88e5e3abad.PDF 汉嘉设计集团股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/79e9167d0336a9f471ab77dc04ec978fc4a89171.PDF 大博医疗科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/f587290218d881e18e88fc1431b022b2c5aca81a.PDF 苏州东微半导体股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/8c130bea710816d5ef9a8bd7b2df7ee46e9f40dd.PDF 浙江开尔新材料股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/d6867ff3ecf9882065e45612eaf7b33ccb9a95dc.PDF 兰州海默科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/e1e8e0551703f4a1f2873755ef46210b0889c5cd.PDF 读者出版传媒股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/fa3eaf37dadb2a7495e3f774434f8347ce5c51c8.PDF 北京银行\n",
      "./bs_challenge_financial_14b_dataset/pdf/9c4a118cf576f91aed791fb6fe6926180e2dcc65.PDF 南京市的公司名称无法从所给信息中直接识别出来，因为文档中没有明确的公司名称。仅提供了地址信息。\n",
      "./bs_challenge_financial_14b_dataset/pdf/88acba2e98dedd85fdd38192a639dbcc56faf3ed.PDF 森赫电梯股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/d336d607e1d431cbfe1f313e2234a13fcf49a16e.PDF 湖南国科微电子股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/ba561001c2abd0714c413e2f6a77ffee58c644a6.PDF 联化科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/ca1191af7558e0f18966e4df589368a3a5f5e1e6.PDF 青海互助青稞酒股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/72103a0ef6299585cdd09751682534d873f8e64e.PDF 北京七星华创电子股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/5c2707b186a798118bba251cce8d1fc331b5cbba.PDF 厦门科华恒盛股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/f0e86e33d654c797626767e660bb234ff469bee8.PDF 大连派思燃气系统股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/a244b6ed9da7411f804e62b82a8fdfc015dff284.PDF 惠州光弘科技股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/d6c70c04287ff6aba75b0565bef375469aaa68c3.PDF 常熟风范电力设备股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/52fd4a323d97356a463959671794e900699baae5.PDF 深圳信立泰药业股份有限公司\n",
      "./bs_challenge_financial_14b_dataset/pdf/e774a06e6b4db734424f7d9181b9515a08bea6cc.PDF 中钢集团\n",
      "./bs_challenge_financial_14b_dataset/pdf/f9e84ce0edd5279773b3ca1f36a9e39d6ceaf220.PDF 杭州中恒电气股份有限公司\n"
     ]
    }
   ],
   "source": [
    "pdf_company_list = []\n",
    "for pdf_path in glob.glob('./bs_challenge_financial_14b_dataset/pdf/*'):\n",
    "    pdf = pdfplumber.open(pdf_path)\n",
    "    for i in range(10):\n",
    "        first_page_content = pdf.pages[i].extract_text()\n",
    "        first_page_content = first_page_content.strip()\n",
    "        if len(first_page_content) > 0:\n",
    "            break\n",
    "    company_name = ask_glm(prompt_for_company_name.format(first_page_content))['choices'][0]['message']['content']\n",
    "    company_name = company_name.split(' ')[0].split('\\n')[0]\n",
    "    print(pdf_path, company_name)\n",
    "    pdf_company_list.append([pdf_path, company_name])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "01b0cb8b-1005-4fcc-ba81-f01c06b898d3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': '在2019年1月15日，根据中信行业分类标准，能否帮我查询一下有多少家A股公司的股票代码是000065？相关的数据可以在A股公司行业划分表里找到，谢谢。',\n",
       " 'answer': '',\n",
       " 'reference': ''}"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "questions = json.load(open('博金杯金融问答_query.json'))\n",
    "questions[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "8e5f3e6b-3817-4c92-9448-b56467b5f0e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "reference_words = [list(x) for x in parser.table_names]\n",
    "reference_words += [list(x[1]) for x in pdf_company_list]\n",
    "reference_path = parser.table_names + [x[0].split('/')[-1] for x in pdf_company_list]\n",
    "bm25 = BM25Okapi(reference_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "64f2e171-798a-454a-ba09-53973b32298e",
   "metadata": {},
   "outputs": [],
   "source": [
    "for query_idx in range(len(questions)):\n",
    "    doc_scores = bm25.get_scores(list(questions[query_idx][\"question\"]))\n",
    "    max_score_page_idx = doc_scores.argsort()[::-1][0]\n",
    "    questions[query_idx]['reference'] = reference_path[max_score_page_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "e5eee695-c564-48a6-aa2b-1db9e8c05fa0",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('submit_reference_bm25.json', 'w', encoding='utf8') as up:\n",
    "    json.dump(questions, up, ensure_ascii=False, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92e4ae95-26fa-489e-97ec-323777f39a55",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
